Importing Libraries¶

In [1]:
#Importing Libraries for ML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import ttest_ind

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import mutual_info_classif

import warnings
warnings.filterwarnings("ignore")

Importing Cleaned Data¶

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.head(3)
Out[2]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 0 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 ... 0.06545 0.02211 21.033 1.0 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 1 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 ... 0.09403 0.01929 19.085 1.0 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 2 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 ... 0.08270 0.01309 20.651 1.0 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634

3 rows × 24 columns

Data Exploration (EDA)¶

  1. Univariate analysis on numerical data
  2. Bivariate analysis on numerical data vs numerical data
  3. Multivariate analysis on numerical data

Univariate analysis on numerical data¶

Descriptive Statistics¶

In [3]:
# Function to create dataframe for columns and its unique count of values, datatypes and type of columns
def find_categorical_continous_variables(data, threashold):
    dic = {}
    continus_cols = []
    categorical_cols = []

    for column in data.columns:
        li = []
        li.append(len(data[column].unique()))
        li.append(data[column].dtype)

        # Check if the column is categorical or continuous
        if len(data[column].unique()) <= threashold or data[column].dtype == 'O':
            li.append("Categorical column")
            categorical_cols.append(column)
        else:
            li.append("Continous Column")
            continus_cols.append(column)

        dic[column] = li
    
    # create dataframe for columns and its details
    dic_df = pd.DataFrame(dic, index=["Unique values", "Data Type", "Categorical/Continous"])
    
    # return the dataframe, and lists for continous and categorical columns
    return dic_df.T, categorical_cols, continus_cols

# Function to find descriptive statistics by providing it the contious cols and dataframe
def descriptive_statistics_continous(df, continus_cols):
    dic={}
    for col in continus_cols:
        dic[col] = []
        dic[col].append(df[col].mean())
        dic[col].append(df[col].median())
        dic[col].append(df[col].mode()[0])
        dic[col].append(df[col].std())
        dic[col].append(df[col].var())
        dic[col].append(df[col].max() - df[col].min())
        dic[col].append(df[col].quantile(0.75) - df[col].quantile(0.25) )
        dic[col].append(df[col].skew())
        dic[col].append(df[col].kurtosis())

    df_details = pd.DataFrame(dic, index=["Mean", "Median", "Mode", "Std Deviation", "Variance", "Range", "IQR", "Skew", "Kurtosis"])
    return df_details.T
In [4]:
# lets find out the unique counts, datatypes, variable type like continous/categorical and lists for columns names having continous/categorical columns
col_type_df, categorical_cols, continus_cols = find_categorical_continous_variables(df, 10)
In [5]:
# display the data
col_type_df
Out[5]:
Unique values Data Type Categorical/Continous
name 189 int64 Continous Column
MDVP:Fo(Hz) 189 float64 Continous Column
MDVP:Fhi(Hz) 189 float64 Continous Column
MDVP:Flo(Hz) 189 float64 Continous Column
MDVP:Jitter(%) 168 float64 Continous Column
MDVP:Jitter(Abs) 18 float64 Continous Column
MDVP:RAP 149 float64 Continous Column
MDVP:PPQ 162 float64 Continous Column
Jitter:DDP 174 float64 Continous Column
MDVP:Shimmer 182 float64 Continous Column
MDVP:Shimmer(dB) 145 float64 Continous Column
Shimmer:APQ3 180 float64 Continous Column
Shimmer:APQ5 183 float64 Continous Column
MDVP:APQ 183 float64 Continous Column
Shimmer:DDA 184 float64 Continous Column
NHR 181 float64 Continous Column
HNR 189 float64 Continous Column
status 2 float64 Categorical column
RPDE 189 float64 Continous Column
DFA 189 float64 Continous Column
spread1 189 float64 Continous Column
spread2 188 float64 Continous Column
D2 189 float64 Continous Column
PPE 189 float64 Continous Column
In [6]:
# print columns 
print("Categorical Columns -\n",categorical_cols, "\n")
print("Continous Columns -\n",continus_cols)
Categorical Columns -
 ['status'] 

Continous Columns -
 ['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

Conclusion -¶

  1. We have two categorical variables i.e. ['name', 'status']

  2. We have 22 Continous variables i.e. ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

In [7]:
# Get the descriptive statistics 
descriptive_statistics_continous(df, continus_cols)
Out[7]:
Mean Median Mode Std Deviation Variance Range IQR Skew Kurtosis
name 94.000000 94.000000 0.000000 54.703748 2.992500e+03 188.000000 94.000000 0.000000 -1.200000
MDVP:Fo(Hz) 154.605011 148.790000 88.333000 41.525291 1.724350e+03 171.772000 65.557000 0.592528 -0.633226
MDVP:Fhi(Hz) 198.026812 176.595000 102.145000 92.490876 8.554562e+03 489.885000 88.913000 2.515378 7.401087
MDVP:Flo(Hz) 115.305561 102.874000 1.000000 44.653702 1.993953e+03 238.170000 51.701000 1.130111 0.734658
MDVP:Jitter(%) 0.006093 0.004940 0.003690 0.004558 2.077129e-05 0.031480 0.003810 3.050555 12.379222
MDVP:Jitter(Abs) 0.000043 0.000030 0.000030 0.000034 1.157854e-09 0.000253 0.000030 2.775989 12.202619
MDVP:RAP 0.003236 0.002470 0.001690 0.002807 7.878800e-06 0.020760 0.002070 3.408734 15.450912
MDVP:PPQ 0.066817 0.002630 0.003320 0.872632 7.614869e-01 12.000000 0.002040 13.747551 188.996748
Jitter:DDP 0.009688 0.007420 0.005070 0.008418 7.085477e-05 0.062290 0.006180 3.420839 15.523735
MDVP:Shimmer 0.029264 0.022960 0.014500 0.017908 3.207078e-04 0.084650 0.021150 1.420081 1.657132
MDVP:Shimmer(dB) 0.282175 0.221000 0.154000 0.189523 3.591883e-02 0.933000 0.201000 1.704636 2.910687
Shimmer:APQ3 0.015492 0.012770 0.004690 0.009823 9.649054e-05 0.050830 0.012010 1.475233 2.208598
Shimmer:APQ5 0.017617 0.013430 0.007470 0.011290 1.274623e-04 0.049860 0.012740 1.446952 1.428675
MDVP:APQ 0.023583 0.017990 0.009030 0.015002 2.250460e-04 0.080890 0.016220 1.660942 2.946433
Shimmer:DDA 0.046476 0.038310 0.015670 0.029468 8.683737e-04 0.152510 0.036020 1.475234 2.208833
NHR 0.023823 0.011660 0.002310 0.038322 1.468603e-03 0.314170 0.018900 4.505491 25.951523
HNR 21.961164 22.085000 8.867000 4.268406 1.821929e+01 24.180000 5.763000 -0.445051 0.571981
RPDE 0.497594 0.491345 0.256570 0.103133 1.063635e-02 0.428581 0.163781 -0.129763 -0.909410
DFA 0.717117 0.722085 0.574282 0.054975 3.022233e-03 0.251006 0.086725 -0.039189 -0.663940
spread1 -5.725437 -5.736781 -7.964984 1.051205 1.105032e+00 5.125228 1.320026 0.347205 -0.210744
spread2 0.223666 0.217013 0.210279 0.081229 6.598211e-03 0.428052 0.099760 0.074474 -0.130574
D2 2.376950 2.361532 1.423287 0.368745 1.359730e-01 1.990362 0.531619 0.312325 -0.072176
PPE 0.203052 0.192730 0.044539 0.086461 7.475510e-03 0.412994 0.113350 0.721347 0.261612

Conclusion -¶

We can see the descriptive statistical inferences like above

In [8]:
def plot_distribution_numerical(df):
    # Lets plot the histogram for each variable
    features = df.select_dtypes(include='number').columns

    # plot the subplot for histogram of each variable
    fig, axs = plt.subplots(23, 3, figsize=(20, 60))
    row = 0
    for feature in features:

        axs[row, 0].set_title("Histogram for {}".format(feature))
        sns.histplot(data=df, x=feature, kde=True, color="red", ax=axs[row, 0])

        axs[row, 1].set_title("Boxplot for {}".format(feature))
        sns.boxplot(data=df, x=feature,  color="skyblue", ax=axs[row, 1])

        axs[row, 2].set_title("Vaiolinplot for {}".format(feature))
        sns.violinplot(data=df, x=feature,  color="lightgreen", ax=axs[row, 2])
        row = row+1

    plt.title("Histogram, Boxplot and Violinplots for all variables")
    plt.tight_layout()
    plt.show()
    
plot_distribution_numerical(df)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[8], line 24
     21     plt.tight_layout()
     22     plt.show()
---> 24 plot_distribution_numerical(df)

Cell In[8], line 10, in plot_distribution_numerical(df)
      7 row = 0
      8 for feature in features:
---> 10     axs[row, 0].set_title("Histogram for {}".format(feature))
     11     sns.histplot(data=df, x=feature, kde=True, color="red", ax=axs[row, 0])
     13     axs[row, 1].set_title("Boxplot for {}".format(feature))

IndexError: index 23 is out of bounds for axis 0 with size 23
No description has been provided for this image

Conclusion -¶

  • As per the skewness, kurtosis and above histogram, boxplots and violinplots we can see that below columns are normally distributed -

DFA, spread1, spread2, D2 and PPE

  • Below columns are skewed to the right

'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA'

Bivariate analysis on numerical data¶

In [9]:
# Covariance for pair of variables
def covariance_of_pair_variables(df):
    features = df.select_dtypes(include='number').columns
    dic_covariance = {}

    for index1 in range(0, len(features)):
        for index2 in range(index1+1, len(features)):
            dic_covariance[features[index1] + " - " + features[index2]] = df[features[index1]].cov(df[features[index2]])
            
    covariance_df = pd.DataFrame(dic_covariance, index=["Covariance"]).T
    return covariance_df.sort_values(by='Covariance')


covariance_of_pair_variables(df)
Out[9]:
Covariance
name - MDVP:Flo(Hz) -143.088761
name - HNR -48.473436
MDVP:Fo(Hz) - spread1 -19.036930
MDVP:Flo(Hz) - spread1 -18.886935
MDVP:Flo(Hz) - status -7.539918
... ...
name - MDVP:Fo(Hz) 81.688888
MDVP:Fhi(Hz) - MDVP:Flo(Hz) 351.507050
name - MDVP:Fhi(Hz) 706.529244
MDVP:Fo(Hz) - MDVP:Flo(Hz) 1081.942775
MDVP:Fo(Hz) - MDVP:Fhi(Hz) 1511.889706

276 rows × 1 columns

Conclusion -¶

Below pair of variables are having the lowest covariance -

  • MDVP:Fo(Hz) - spread1
  • MDVP:Flo(Hz) - spread1
  • MDVP:Fhi(Hz) - spread1
  • MDVP:Flo(Hz) - status
  • MDVP:Fo(Hz) - status

Below pair of variables are having the highest covariance -

  • MDVP:Fo(Hz) - HNR
  • MDVP:Flo(Hz) - HNR
  • MDVP:Fhi(Hz) - MDVP:Flo(Hz)
  • MDVP:Fo(Hz) - MDVP:Flo(Hz)
  • MDVP:Fo(Hz) - MDVP:Fhi(Hz)
In [10]:
# Correlations matrix
plt.figure(figsize=(15, 15))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
Out[10]:
<Axes: >
No description has been provided for this image
In [11]:
# plotting the scatterplots for all variables combinations 
sns.pairplot(df)
Out[11]:
<seaborn.axisgrid.PairGrid at 0x1bb42c42dd0>
No description has been provided for this image

Conclusions -¶

As per the scatterplot and heatmap presented we can see that below variables are highly correlated to each other

  • 'MDVP:Fo(Hz)' => 'MDVP:Fhi(Hz)'
  • 'MDVP:Fo(Hz)' => 'MDVP:Flo(Hz)'
  • 'MDVP:Flo(Hz)' => 'MDVP:Fhi(Hz)'
  • 'MDVP:RAP' => 'MDVP:Jitter(%)'
  • 'MDVP:Jitter(%)'=> 'Jitter:DDP'
  • 'MDVP:Jitter(%)'=> 'MDVP:PPQ'
  • 'MDVP:Jitter(%)'=> 'MDVP:RAP'
  • 'MDVP:Jitter(%)'=> 'NHR'
  • 'MDVP:Shimmer' => 'MDVP:Shimmer',
  • 'MDVP:Shimmer' => 'MDVP:Shimmer(dB)',
  • 'MDVP:Shimmer' => 'Shimmer:DDA',
  • 'MDVP:Shimmer' => 'Shimmer:APQ5',
  • 'MDVP:Shimmer' => 'NHR',
  • 'MDVP:Shimmer' => 'Shimmer:APQ3',
  • 'MDVP:Shimmer' => 'HNR'
  • 'MDVP:Shimmer' => 'MDVP:APQ',
In [12]:
def t_test_for_two_groups(df, target_var, first_target, second_target):
    for feature in df.select_dtypes(include='number').columns:
        # Lets have two groups one for datapoints when "status is 0" and another when "status is 1"
        group_1 = df[df[target_var]==first_target][feature]
        group_0 = df[df[target_var]==second_target][feature]

        # Perform independent samples t-test
        t_statistic, p_value = ttest_ind(group_1, group_0)

        # Print the results
        print("\n---------{} for two outcomes of {}-------------\n".format(feature, target_var))
        print("T-statistic:", t_statistic)
        print("P-value:", p_value)

        # Interpret the results
        alpha = 0.05
        if p_value < alpha:
            print("Reject the null hypothesis: There is a significant difference in means of the two data groups of {} when {} is {} and {}".format(feature, target_var, first_target, second_target))
        else:
            print("Fail to reject the null hypothesis: There is NO significant difference in means of the two data groups of {} when {} is {} and {}".format(feature, target_var, first_target, second_target))
            
            
t_test_for_two_groups(df, 'status', 1, 0)
---------name for two outcomes of status-------------

T-statistic: -2.618996447591034
P-value: 0.009542284438269866
Reject the null hypothesis: There is a significant difference in means of the two data groups of name when status is 1 and 0

---------MDVP:Fo(Hz) for two outcomes of status-------------

T-statistic: -5.447839322812917
P-value: 1.592227189146496e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Fo(Hz) when status is 1 and 0

---------MDVP:Fhi(Hz) for two outcomes of status-------------

T-statistic: -2.105966006139477
P-value: 0.03654192080671688
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Fhi(Hz) when status is 1 and 0

---------MDVP:Flo(Hz) for two outcomes of status-------------

T-statistic: -5.692141471654098
P-value: 4.784923852241762e-08
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Flo(Hz) when status is 1 and 0

---------MDVP:Jitter(%) for two outcomes of status-------------

T-statistic: 4.0859747918809335
P-value: 6.506825054289377e-05
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Jitter(%) when status is 1 and 0

---------MDVP:Jitter(Abs) for two outcomes of status-------------

T-statistic: 4.8761944285090975
P-value: 2.3037111641167024e-06
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Jitter(Abs) when status is 1 and 0

---------MDVP:RAP for two outcomes of status-------------

T-statistic: 3.8768349749731166
P-value: 0.0001463942426461765
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:RAP when status is 1 and 0

---------MDVP:PPQ for two outcomes of status-------------

T-statistic: 0.6027388372559829
P-value: 0.5474126882565669
Fail to reject the null hypothesis: There is NO significant difference in means of the two data groups of MDVP:PPQ when status is 1 and 0

---------Jitter:DDP for two outcomes of status-------------

T-statistic: 3.856292612294128
P-value: 0.0001582581388863351
Reject the null hypothesis: There is a significant difference in means of the two data groups of Jitter:DDP when status is 1 and 0

---------MDVP:Shimmer for two outcomes of status-------------

T-statistic: 5.7197069054907725
P-value: 4.168890849126019e-08
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Shimmer when status is 1 and 0

---------MDVP:Shimmer(dB) for two outcomes of status-------------

T-statistic: 5.5083786926335705
P-value: 1.1858552827842587e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Shimmer(dB) when status is 1 and 0

---------Shimmer:APQ3 for two outcomes of status-------------

T-statistic: 5.300447394450554
P-value: 3.2329358091596845e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of Shimmer:APQ3 when status is 1 and 0

---------Shimmer:APQ5 for two outcomes of status-------------

T-statistic: 5.5091649170951404
P-value: 1.1813090095950779e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of Shimmer:APQ5 when status is 1 and 0

---------MDVP:APQ for two outcomes of status-------------

T-statistic: 6.077737643304644
P-value: 6.6971916263062925e-09
Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:APQ when status is 1 and 0

---------Shimmer:DDA for two outcomes of status-------------

T-statistic: 5.300281352950314
P-value: 3.2354923539354584e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of Shimmer:DDA when status is 1 and 0

---------NHR for two outcomes of status-------------

T-statistic: 2.647959426829298
P-value: 0.00878949536615006
Reject the null hypothesis: There is a significant difference in means of the two data groups of NHR when status is 1 and 0

---------HNR for two outcomes of status-------------

T-statistic: -5.409563773613764
P-value: 1.9161398150073093e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of HNR when status is 1 and 0

---------status for two outcomes of status-------------

T-statistic: inf
P-value: 0.0
Reject the null hypothesis: There is a significant difference in means of the two data groups of status when status is 1 and 0

---------RPDE for two outcomes of status-------------

T-statistic: 4.350125625164272
P-value: 2.2342434057464583e-05
Reject the null hypothesis: There is a significant difference in means of the two data groups of RPDE when status is 1 and 0

---------DFA for two outcomes of status-------------

T-statistic: 3.319606465498123
P-value: 0.0010836705446395163
Reject the null hypothesis: There is a significant difference in means of the two data groups of DFA when status is 1 and 0

---------spread1 for two outcomes of status-------------

T-statistic: 9.492051691133758
P-value: 1.0853373799969717e-17
Reject the null hypothesis: There is a significant difference in means of the two data groups of spread1 when status is 1 and 0

---------spread2 for two outcomes of status-------------

T-statistic: 7.067310887483291
P-value: 3.0381818141581874e-11
Reject the null hypothesis: There is a significant difference in means of the two data groups of spread2 when status is 1 and 0

---------D2 for two outcomes of status-------------

T-statistic: 5.3974150453986125
P-value: 2.0317665984241093e-07
Reject the null hypothesis: There is a significant difference in means of the two data groups of D2 when status is 1 and 0

---------PPE for two outcomes of status-------------

T-statistic: 8.695379609608944
P-value: 1.7455826911011754e-15
Reject the null hypothesis: There is a significant difference in means of the two data groups of PPE when status is 1 and 0

Multivariate analysis on numerical data¶

In [14]:
from sklearn import datasets, decomposition 
# Split the DataFrame into X and Y datasets

# Features (independent variables)
X = df.drop(columns=['status', 'name'])  

# Target variable (dependent variable)
y = df['status']  
In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

#Combine PCA Components and Target Variable
df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
df['Target'] = y
In [16]:
#Apply Clustering Algorithm
kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[['PC1', 'PC2', 'Target']])
In [17]:
# Visualize clustering results
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', style='Target', palette='viridis')
plt.title('PCA Clustering based on Target Variable')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.show()
No description has been provided for this image